# download_irrodl_issue.py
# IRRODL (International Review of Research in Open and Distributed Learning) Downloader
# Automates downloading PDFs from IRRODL (OJS-based journal)
# - Crawls issue article pages
# - Extracts /download/ galley links from /view/ URLs
# - Skips Editorials and Book Reviews
# - Creates issue-based folders with sanitized filenames

import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Root of the IRRODL site; prefixed to site-relative links scraped from pages
# to turn them into absolute URLs.
BASE = "https://www.irrodl.org"
# Headers sent with every HTTP request made by this script.
# NOTE(review): presumably a browser-like User-Agent is used so the server does
# not reject the default python-requests agent — confirm.
headers = {"User-Agent": "Mozilla/5.0"}

def get_real_pdf_url(view_url):
    """Resolve an article "view" page to an absolute PDF download URL.

    The page is fetched and searched in two passes: first for a literal
    ``/index.php/irrodl/article/download/<id>/<id>`` path anywhere in the
    HTML, then for an ``<a class="download" href=...>`` element.

    Args:
        view_url: Absolute URL of the article (or galley) view page.

    Returns:
        The absolute download URL as a string, or ``None`` when the page
        could not be retrieved successfully (non-2xx status) or contains no
        recognizable download link.

    Raises:
        requests.RequestException: On connection failures or when the server
            does not answer within the timeout.
    """
    # A timeout prevents a stalled connection from hanging the whole run.
    r = requests.get(view_url, headers=headers, timeout=30)
    if not r.ok:
        # Do not scrape an error page; report "no link" like any other miss.
        return None
    html = r.text

    match = re.search(r'/index\.php/irrodl/article/download/\d+/\d+', html)
    if match:
        # The pattern always yields a site-root path, so anchor it at BASE.
        return urljoin(BASE, match.group(0))

    soup = BeautifulSoup(html, 'html.parser')
    tag = soup.find('a', class_='download', href=True)
    if tag:
        # The href may be absolute, root-relative or page-relative; resolving
        # it against the page it came from handles all three.
        return urljoin(view_url, tag['href'])

    return None

def sanitize_filename(name):
    """Return *name* made safe for use as a file or folder name.

    Characters that are illegal in file names on common platforms
    (``\\ / * ? : " < > |``) are removed, every run of whitespace (including
    newlines and tabs) is collapsed to a single space, and the result is
    limited to 200 characters with no leading or trailing whitespace.

    Args:
        name: Arbitrary text, e.g. an article or issue title.

    Returns:
        The cleaned string; may be empty if nothing usable remains.
    """
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = re.sub(r'\s+', " ", name)  # collapse \n, \t and repeated spaces
    # Strip again after truncating: cutting at 200 characters can land right
    # after a space, and trailing whitespace in file names is troublesome.
    return name.strip()[:200].rstrip()

# ---------------------------------------------------------------------------
# Script body: ask for an issue URL, then download the PDF of every article
# listed on that issue page into a folder named after the issue.
# ---------------------------------------------------------------------------
issue_url = input("Enter IRRODL issue URL: ").strip()
try:
    # The timeout avoids hanging forever; raise_for_status stops us from
    # parsing an HTTP error page as if it were the issue's table of contents.
    r = requests.get(issue_url, headers=headers, timeout=30)
    r.raise_for_status()
except requests.RequestException as e:
    raise SystemExit(f"❌ Could not load issue page {issue_url}: {e}") from e
soup = BeautifulSoup(r.text, 'html.parser')

# The page <title> names the issue; fall back to a fixed name if it is absent.
title_tag = soup.find("title")
raw_title = title_tag.get_text() if title_tag is not None else "issue"
folder = sanitize_filename("IRRODL_" + raw_title)
os.makedirs(folder, exist_ok=True)

articles = soup.select('.obj_article_summary h3 a')
print(f"🔍 Found {len(articles)} potential articles")

# Titles containing any of these markers are not research articles.
SKIP_MARKERS = ("Editorial", "Book Review", "Book Notes")

# Lower-cased file names already written in this run, so that two titles
# which sanitize to the same name do not overwrite each other (lower-cased
# because some filesystems are case-insensitive).
used_names = set()

count = 0
for a in articles:
    title = a.get_text(strip=True)

    if any(marker in title for marker in SKIP_MARKERS):
        print(f"⏩ Skipping: {title}")
        continue

    href = a.get('href')
    if not href:
        print(f"❌ No valid PDF URL for {title}")
        continue
    # urljoin leaves absolute links untouched and anchors relative ones.
    link = urljoin(BASE, href)

    print(f"[{count+1}] ⬇️ Downloading: {title}")
    try:
        # Resolved inside the try so that one unreachable article page does
        # not abort the remaining downloads.
        pdf_url = get_real_pdf_url(link)
        if not pdf_url:
            print(f"❌ No valid PDF URL for {title}")
            continue

        pdf = requests.get(pdf_url, headers=headers, timeout=60)
        pdf.raise_for_status()
        if "application/pdf" not in pdf.headers.get("Content-Type", ""):
            print(f"❌ Skipped (not PDF): {title} | {pdf_url}")
            continue

        # A title made only of stripped characters would otherwise give ".pdf".
        stem = sanitize_filename(title) or "untitled"
        fname = stem + ".pdf"
        suffix = 2
        while fname.lower() in used_names:
            fname = f"{stem} ({suffix}).pdf"
            suffix += 1
        used_names.add(fname.lower())

        path = os.path.join(folder, fname)
        with open(path, "wb") as f:
            f.write(pdf.content)
        count += 1
        print(f"✅ Saved: {fname}")

    except Exception as e:
        # Per-article boundary: report the failure and move on to the next one.
        print(f"❌ Error downloading {title}: {e}")

print(f"\n🎉 All done! {count} PDFs saved into {folder}")
